Initial data and analysis set up

#import libraries for...
#plotting
#import libraries for...
#library() errors immediately if a package is missing, unlike require(),
#which only warns and returns FALSE — failures would otherwise surface much
#later as cryptic "could not find function" errors.
#plotting
library(gplots)
library(ggplot2)
library(ggthemes)
library(scales)
library(grid)
#modelling
library(car)
library(fitdistrplus)
library(lme4)
library(nlme)
#data wrangling
library(gdata)
library(reshape2)
library(data.table)
library(dplyr)
library(tidyr)
library(purrr)

sessionInfo() #for reproducibility
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Fedora 20 (Heisenbug)
## 
## locale:
##  [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8    
##  [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8   
##  [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] purrr_0.2.1        tidyr_0.4.1        dplyr_0.4.3       
##  [4] data.table_1.9.6   reshape2_1.2.2     gdata_2.17.0      
##  [7] nlme_3.1-128       lme4_1.1-12        Matrix_1.2-0      
## [10] fitdistrplus_1.0-6 car_2.0-16         nnet_7.3-9        
## [13] MASS_7.3-40        scales_0.4.0       ggthemes_3.0.3    
## [16] ggplot2_2.1.0      gplots_3.0.1      
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.3        formatR_1.3        nloptr_1.0.4      
##  [4] plyr_1.8           bitops_1.0-6       tools_3.2.0       
##  [7] digest_0.6.4       evaluate_0.8.3     gtable_0.1.2      
## [10] lattice_0.20-31    DBI_0.3.1          parallel_3.2.0    
## [13] yaml_2.1.13        stringr_0.6.2      knitr_1.12.3      
## [16] gtools_3.5.0       caTools_1.17.1     R6_2.1.2          
## [19] survival_2.38-1    rmarkdown_0.9.5    minqa_1.2.4       
## [22] magrittr_1.5       htmltools_0.3.5    splines_3.2.0     
## [25] assertthat_0.1     colorspace_1.2-4   KernSmooth_2.23-14
## [28] munsell_0.4.2      chron_2.3-47
original_par <- par() #for resetting to original par after generating the plot in the null device

Data import

Import a variety of experimental data sets recorded with non-standard annotation. They were all generated according to the protocol described in the methods, but the conditions are frequently changed.

import the data from csv

# Import the raw plate-reader CSVs; '#' lines inside the files are metadata
# comments. Some files lack a "date" column, so one is added here. Assigning
# a scalar recycles it to every row — unlike the previous rep(date, N) calls,
# whose hard-coded row counts (240, 528, ...) would silently mismatch or
# error if an input file ever gained or lost rows.
raw_data1 <- read.csv('Data/crystal_violet_biofilm/160524_cv_merge.csv', comment.char = '#')
raw_data2 <- read.csv('Data/crystal_violet_biofilm/160316_cv_mutants_full1.csv', comment.char = '#')
raw_data2[, "date"] <- 160316 # add dates for those instances where the column does not exist
raw_data3 <- read.csv('Data/crystal_violet_biofilm/160324_cv_mutants_full2.csv', comment.char = '#')
raw_data3[, "date"] <- 160324
raw_data3 <- filter(raw_data3, plate != 'p8', plate != 'p7', plate != 'p6', plate != 'p5') #those plates are diluted repeats
raw_data4 <- read.csv('Data/crystal_violet_biofilm/160326_cv_mutants_full3.csv', comment.char = '#')
raw_data4[, "date"] <- 160326
raw_data5 <- read.csv('Data/crystal_violet_biofilm/160505_cv_o2.csv', comment.char = '#')
raw_data5[, "date"] <- 160505
raw_data6 <- read.csv('Data/crystal_violet_biofilm/160414_cv_mutants_method.csv', comment.char = '#')
raw_data6[, "date"] <- 160414
raw_data7 <- read.csv('Data/crystal_violet_biofilm/160416_cv_mutants_method2.csv', comment.char = '#')
raw_data7[, "date"] <- 160416

Data processing

The raw data are melted and reshaped, standardizing the column names and entries (somewhat) to make them easier to work with.

# Pool all experiments into one molten data frame.
# Fixes two issues with the original loop: data_list[i] (single bracket)
# handed melt() a one-element *list* rather than the data frame itself,
# which routed through melt.list and tacked on a spurious "L1" index column
# (harmless only because the dcast() formula below never referenced it);
# and melt_pool was grown with rbind() inside a loop. lapply + one rbind
# does the same merge without either problem.
data_list <- list(raw_data1, raw_data2, raw_data3, raw_data4, raw_data5, raw_data6, raw_data7)
id_columns <- c('sample_strain', 'sample_id', 'location', 'date')
melt_pool <- do.call(rbind, lapply(data_list, function(d) melt(d, id.vars = id_columns))) #merge molten data
# Recast the molten pool wide again (one column per measured variable),
# filling combinations a given experiment did not record with "not_reported",
# then normalize those placeholder values to sensible defaults in place.
cast_data <- dcast(melt_pool, location + sample_strain + sample_id + date ~ variable, fill = "not_reported") #recast
cast_data <- data.table(cast_data) #first data.table usage!
cast_data[sample_strain == "", sample_strain := "LAC"] #update those rows with sample_strain=="" to "LAC"
cast_data[media == "not_reported", media := "TSB"] #absent media is TSB
cast_data[coating == "not_reported" | coating == "", coating := "none"] #absent coating is none
cast_data[O2 == "not_reported", O2 := "aerobic"] #absent O2 is aerobic
cast_data[date == "160326", drop := 'true'] #remove these based on their lack of useable data (found when looking at the raw_platemap)
#EDIT after investigation remove all of 160326
cast_data[plate == "p3" & date == "160324", drop := 'true']
cast_data[sample_strain == "LAC" & sample_id == "empty", sample_strain := "empty"] #fix up this improper labelling
cast_data[drop == "not_reported", drop := "F"] #"F"/"true" strings are converted to logical downstream via as.logical()
str(cast_data)
## Classes 'data.table' and 'data.frame':   1662 obs. of  13 variables:
##  $ location     : Factor w/ 540 levels "p1b1","p1b10",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ sample_strain: Factor w/ 6 levels "22251","empty",..: 2 2 2 2 3 3 3 3 3 3 ...
##  $ sample_id    : Factor w/ 40 levels "","agrA_C123F_20",..: 1 1 1 7 12 12 12 3 4 5 ...
##  $ date         : num  160505 160508 160511 160414 160324 ...
##  $ plate        : chr  "p1" "p1" "p1" "p1" ...
##  $ well         : chr  "b1" "b1" "b1" "b1" ...
##  $ media        : chr  "M9" "TSB" "TSB" "0.5_glu_3_nacl" ...
##  $ drop         : chr  "F" "F" "F" "F" ...
##  $ dilution     : chr  "not_reported" "0.1" "0.1" "1" ...
##  $ O2           : chr  "anaerobic" "aerobic" "anaerobic" "aerobic" ...
##  $ OD           : chr  "0.166" "0.1254000068" "0.1400000006" "0.139200002" ...
##  $ coating      : chr  "none" "none" "none" "none" ...
##  $ fixation     : chr  "not_reported" "not_reported" "not_reported" "60C" ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "index")= int

The data are now flagged, and the types are changed to work better with downstream exploration and visualization.

# Convert column types, restrict to the comparable subset of the data, and
# derive positional (row/column) and genetic grouping columns.
working_data <- cast_data %>%
  transform(date = as.factor(date),
            plate = as.factor(plate),
            coating = as.factor(coating),
            OD = as.numeric(OD)) %>% #convert date, plate, coating to factors and OD into numeric
  filter(sample_strain == 'LAC' | sample_strain == "empty",
         media == "TSB" | media == "0.5_glu",
         O2 == "aerobic") %>% #extract only the data for which there will be comparative rafts, also exclude media comparisons
  mutate(row = as.factor(ifelse(grepl("[a-g][0-9]{1,2}", as.character(well)),
                                sub("([a-g])[0-9]{1,2}", "\\1", as.character(well)),
                                "error")),
         col = as.factor(ifelse(grepl("[a-g][0-9]{1,2}", as.character(well)),
                                sub("[a-g]([0-9]{1,2})", "\\1", as.character(well)),
                                "error")), #split well ("b1") into row letter and column number; flag non-matching wells as "error"
         drop = as.logical(drop)) %>% # "F"/"true" strings become FALSE/TRUE
  separate(sample_id, c("genetic_background", "exogenous_genetic"), sep = "_[KOermCF123]{0,5}_?",
           extra = "merge", remove = FALSE) #separate the sample_id into components
str(working_data)
## Classes 'data.table' and 'data.frame':   888 obs. of  17 variables:
##  $ location          : Factor w/ 540 levels "p1b1","p1b10",..: 1 1 1 1 2 2 2 2 2 3 ...
##  $ sample_strain     : Factor w/ 6 levels "22251","empty",..: 2 3 3 3 3 3 3 3 3 2 ...
##  $ sample_id         : Factor w/ 40 levels "","agrA_C123F_20",..: 1 12 12 12 4 8 23 23 38 1 ...
##  $ genetic_background: chr  "" "wt" "wt" "wt" ...
##  $ exogenous_genetic : chr  NA NA NA NA ...
##  $ date              : Factor w/ 8 levels "160316","160324",..: 7 2 3 5 5 7 2 3 4 1 ...
##  $ plate             : Factor w/ 4 levels "p1","p2","p3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ well              : chr  "b1" "b1" "b1" "b1" ...
##  $ media             : chr  "TSB" "TSB" "TSB" "0.5_glu" ...
##  $ drop              : logi  FALSE FALSE TRUE FALSE FALSE TRUE ...
##  $ dilution          : chr  "0.1" "0.1" "1" "1" ...
##  $ O2                : chr  "aerobic" "aerobic" "aerobic" "aerobic" ...
##  $ OD                : num  0.125 0.372 0.536 2.898 2.907 ...
##  $ coating           : Factor w/ 3 levels "0.01_FBS","FBS",..: 3 1 1 3 3 3 1 1 3 1 ...
##  $ fixation          : chr  "not_reported" "60C" "60C" "60C" ...
##  $ row               : Factor w/ 6 levels "b","c","d","e",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ col               : Factor w/ 12 levels "1","10","11",..: 1 1 1 1 2 2 2 2 2 3 ...

From the output of the `separate` function it is clear that the resulting categories need some reworking; below, queries are made and categories are simplified and merged.

# Interactive-style cleanup of the exogenous_genetic column produced by
# separate(): each unique() call inspects a suspect category, then the
# following := rewrites it in place. NOTE: the character-subset lookups
# (working_data["20", ...] etc.) rely on the key set here — do not reorder.
setkey(working_data, "exogenous_genetic")
unique(working_data[,.(exogenous_genetic)]) # we need to clean up this category
unique(working_data[NA_character_,.(exogenous_genetic, genetic_background, sample_strain, sample_id)]) #what are the entries which have NA?
unique(working_data[exogenous_genetic == "",.(exogenous_genetic, genetic_background, sample_strain, sample_id)])
unique(working_data["20",.(exogenous_genetic, genetic_background, sample_strain, sample_id)]) # 1st: working through the list, keyed lookup of "20"
working_data["20", exogenous_genetic := "pos1_agrA_20"]
unique(working_data[exogenous_genetic == "pos1_empty",.(exogenous_genetic, genetic_background, sample_strain, sample_id)])
unique(working_data[exogenous_genetic == "empty",.(exogenous_genetic, genetic_background, sample_strain, sample_id)])
working_data[exogenous_genetic == "empty", exogenous_genetic := "pos1_empty"]
unique(working_data[exogenous_genetic == "37" | exogenous_genetic == "77",.(exogenous_genetic, genetic_background, sample_strain, sample_id)])
working_data[exogenous_genetic == "37" | exogenous_genetic == "77", exogenous_genetic := ""] # bare clone numbers carry no exogenous construct
unique(working_data[exogenous_genetic == "37_empty" | exogenous_genetic == "77_empty",.(exogenous_genetic, genetic_background, sample_strain, sample_id)])
working_data[exogenous_genetic == "37_empty" | exogenous_genetic == "77_empty", exogenous_genetic := "pos1_empty"]
unique(working_data[exogenous_genetic == "37_comp",.(exogenous_genetic, genetic_background, sample_strain, sample_id)])
working_data[exogenous_genetic == "37_comp", exogenous_genetic := "pos1_atl"]
unique(working_data[exogenous_genetic == "comp20" | exogenous_genetic == "77_pos1_20",.(exogenous_genetic, genetic_background, sample_strain, sample_id)])
working_data[exogenous_genetic == "comp20" | exogenous_genetic == "77_pos1_20", exogenous_genetic := "pos1_agrA_20"]
unique(working_data[exogenous_genetic == "comp",.(exogenous_genetic, genetic_background, sample_strain, sample_id)])
# "comp" is ambiguous on its own; resolve it per genetic background.
working_data[exogenous_genetic == "comp" & genetic_background == "atl", exogenous_genetic := "pos1_atl"
             ][exogenous_genetic == "comp" & genetic_background == "ica", exogenous_genetic := "pos1_ica"
               ][exogenous_genetic == "comp" & genetic_background == "srtA", exogenous_genetic := "pos1_srtA"]
unique(working_data[,.(genetic_background)]) # check the other separated column
unique(working_data[genetic_background == "",.(exogenous_genetic, genetic_background, sample_strain, sample_id)])
working_data[, simple_id := as.factor(paste(genetic_background, exogenous_genetic, sep = "_"))] # combined label used for all downstream grouping

some last final cleaning

# Flag the remaining known-bad wells (column 6 of the 160324 run), then keep
# only rows not marked for dropping (!drop is the idiomatic form of == FALSE;
# both exclude NA under filter()).
working_data[date == "160324" & col == "6", drop := TRUE]
working_data <- filter(working_data, !drop)

Quick visualization

the plotted raw OD’s in the style of Tecan output

# Plate map of the raw ODs in the style of the Tecan reader output: one facet
# per plate/date combination, wells shaded by OD and labelled with sample_id.
raw_platemap <- ggplot(working_data, aes(x = col, y = row, label = sample_id)) +
  geom_raster(aes(fill = OD)) +
  geom_text(aes(colour = sample_strain), fontface = "bold", size = 3, angle = -45) +
  facet_grid(plate ~ date)
raw_platemap

Normalize

Because the “biofilms” are sensitive to washing, I am normalizing them to the “empty” (background) well and the “wt” (proportionality constant) well in each row. If there is more than one “wt” sample of the same strain, the “wt” wells are averaged per row.

#add the OD_adjusted column which takes the "empty" in the same row and subtracts it from the rest of the values then divde that by the adjusted "wt" value in the same row
#to test the function: x <- filter(working_data, sample_id =="wt")
#add the OD_adjusted column which takes the "empty" in the same row and subtracts it from the rest of the values then divide that by the adjusted "wt" value in the same row
#to test the function: x <- filter(working_data, sample_id =="wt")
# Each step mutates working_data in place via :=, grouped per (date, plate, row):
#   1. stash the empty-well OD, 2. subtract the per-row mean background,
#   3. stash the background-subtracted wt OD, 4. divide by the per-row mean wt.
norm_data <- working_data[sample_strain == "empty", empty_well_values := OD
                          ][, OD_backgroundsub := OD - mean(empty_well_values, na.rm = TRUE), keyby = .(date, plate, row)
                            ][sample_id == "wt", wt_well_values := OD_backgroundsub
                              ][, OD_adjusted := OD_backgroundsub / mean(wt_well_values, na.rm = TRUE), keyby = .(date,plate,row)
                                ][, id_merge := as.factor(paste(date, genetic_background, "KO", exogenous_genetic, sep = '_')) ] # and make a merged id
norm_data <- norm_data[!OD_adjusted < 0,] #some anomalously low OD_adjusted values
# NOTE(review): max/min are captured here, *before* the empty wells are
# filtered out below, so OD_scale is relative to the full range incl. empties.
max_OD <- max(norm_data$OD_adjusted)
min_OD <- min(norm_data$OD_adjusted)
norm_data <- norm_data %>%
  mutate(OD_log = ifelse(OD_adjusted > 0, log10(OD_adjusted), 0),
         OD_scale = (OD_adjusted - min_OD)/(max_OD - min_OD))%>% #use the log base 10 and scale to 0-1
  filter(sample_strain != "empty", drop == FALSE)
norm_data <- norm_data[!OD_adjusted < 0] #get rid of some weird under-0 points (NOTE(review): negatives were already removed above, so this looks like a no-op — confirm)
# Re-plot the plate map with the normalized values for a visual sanity check.
norm_platemap <- ggplot(norm_data, aes(col, row, label = sample_id)) +
  geom_raster(aes(fill=OD_adjusted)) +
  geom_text(fontface = "bold", size = 3, angle = -45) +
  facet_grid(plate~date)
norm_platemap

Dirty group comparison

Since I am going to exclude the anaerobic data in the later part of this analysis, here I will look at it.

# Adjusted OD per detailed sample_id grouping: boxplots with jittered points
# (box outliers suppressed since every point is drawn anyway), log10 y-axis.
# Scale/theme components are order-independent in ggplot2, so they are listed
# after the geoms here; the geom order (box under points) is unchanged.
summary1plot <- ggplot(norm_data, aes(x = sample_id, y = OD_adjusted)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(aes(shape = plate, colour = date), position = "jitter") +
  scale_colour_brewer(palette = "YlOrRd") +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = -90, hjust = 0))
summary1plot #adjusted OD by detailed groupings

# Raw OD per simplified grouping, point shape encoding the surface coating.
# Same layout conventions as summary1plot; scales/theme follow the geoms
# (ggplot2 applies them identically regardless of position in the chain).
summary2plot <- ggplot(norm_data, aes(x = simple_id, y = OD)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(aes(shape = coating, colour = date), size = 3, position = "jitter") +
  scale_shape_manual(values = c(1, 16, 7)) +
  scale_colour_brewer(palette = "YlOrRd") +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = -90, hjust = 0))
summary2plot #OD by simple groupings

Since there is some variability over time in the absolute OD values (also because I am including both FBS- and non-FBS-treated wells), I will be using the normalized values for all analyses.

# As summary2plot but using the normalized (adjusted) OD values, which are
# comparable across dates and coatings.
summary3plot <- ggplot(norm_data, aes(x = simple_id, y = OD_adjusted)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(aes(shape = coating, colour = date), size = 3, position = "jitter") +
  scale_shape_manual(values = c(1, 16, 7)) +
  scale_colour_brewer(palette = "YlOrRd") +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = -90, hjust = 0))
summary3plot #adjusted OD by simple groupings

Some retrospective fixes

some deviation from 1 for wt is OK considering that there are sometimes two wt observations and drift between them…

But excessive deviation warrants some investigation. It seems like the 160326 and 160324 experiments are driving this, therefore the investigation will begin there.

after this analysis was borne out, I made the changes by adding a drop column and removing the offending data from the larger set

lots of variation in the empty wells

of the plates in 160326 plate 1 has the highest disagreement between wt, then 2, 1

unique(norm_data[simple_id == "wt_NA" & date == "160326", .(date, plate, row, OD_backgroundsub, OD, wt_well_values, OD_adjusted)])
## Empty data.table (0 rows) of 7 cols: date,plate,row,OD_backgroundsub,OD,wt_well_values...

of the plates in 160324 plate 4 has the highest disagreement between wt, then 2, 1 also edge wells are responsible

unique(norm_data[simple_id == "wt_NA" & date == "160324", .(date, plate, row, OD_backgroundsub, OD, wt_well_values, OD_adjusted)])
##       date plate row OD_backgroundsub     OD wt_well_values OD_adjusted
##  1: 160324    p1   b           0.2751 0.3720         0.2751           1
##  2: 160324    p1   c           0.2646 0.3645         0.2646           1
##  3: 160324    p1   d           0.3340 0.4259         0.3340           1
##  4: 160324    p1   e           0.4417 0.5432         0.4417           1
##  5: 160324    p1   f           0.3804 0.4757         0.3804           1
##  6: 160324    p1   g           0.3280 0.4279         0.3280           1
##  7: 160324    p2   b           0.2492 0.3375         0.2492           1
##  8: 160324    p2   c           0.2403 0.3305         0.2403           1
##  9: 160324    p2   d           0.2929 0.3901         0.2929           1
## 10: 160324    p2   e           0.3177 0.4384         0.3177           1
## 11: 160324    p2   f           0.3694 0.4643         0.3694           1
## 12: 160324    p2   g           0.3296 0.4442         0.3296           1
## 13: 160324    p4   b           0.2372 0.3285         0.2372           1
## 14: 160324    p4   c           0.1344 0.2238         0.1344           1
## 15: 160324    p4   d           0.2084 0.3043         0.2084           1
## 16: 160324    p4   e           0.1832 0.2718         0.1832           1
## 17: 160324    p4   f           0.2260 0.3345         0.2260           1
## 18: 160324    p4   g           0.1805 0.2936         0.1805           1

in the end I think we will exclude the experiments of 160326 and pre average wt values from 160324

# Per-group summary of the adjusted OD: mean, SD, and a 95% CI from a
# one-sample t interval.
# NOTE(review): t.test() errors on groups with fewer than two observations
# or zero variance — presumably the concern behind the original "CANT DO IT
# CAUSE THE DATA?" remark; confirm every simple_id group has n >= 2.
data_summary <- norm_data %>%
  group_by(simple_id) %>%
  summarise(
    mean = mean(OD_adjusted, na.rm = TRUE), # means comparison
    sdev = sd(OD_adjusted, na.rm = TRUE),
    ci_lower = t.test(OD_adjusted)$conf.int[1], # lower bound of the 95% confidence interval
    ci_upper = t.test(OD_adjusted)$conf.int[2]) # upper bound of the 95% confidence interval
data_summary
## Source: local data table [15 x 5]
## 
##             simple_id      mean       sdev  ci_lower  ci_upper
##                (fctr)     (dbl)      (dbl)     (dbl)     (dbl)
## 1               wt_NA 1.0000000 0.04302054 0.9912832 1.0087168
## 2               agrA_ 4.5587988 2.17442334 3.8230798 5.2945178
## 3                atl_ 0.6362879 0.45641154 0.5007504 0.7718254
## 4               icaA_ 1.2007692 0.78851209 0.8678097 1.5337286
## 5               srtA_ 0.8175688 0.52966599 0.5885240 1.0466135
## 6        atl_pos1_atl 1.5994725 0.59746574 1.4451308 1.7538143
## 7      atl_pos1_empty 0.5913342 0.34650740 0.5018219 0.6808466
## 8     icaA_pos1_empty 1.0717197 0.25459767 0.9616234 1.1818160
## 9     srtA_pos1_empty 1.7692307 0.81503527 1.4250715 2.1133899
## 10      icaA_pos1_ica 1.2929331 0.35655355 1.0663897 1.5194764
## 11     srtA_pos1_srtA 1.8293410 0.70594900 1.5312449 2.1274371
## 12  agrA_pos1_agrA_20 1.1847146 0.42873552 1.0739604 1.2954687
## 13 agrA_pos1_agrA_min 1.6518515 0.55214205 1.3010371 2.0026658
## 14    agrA_pos1_empty 1.7175058 0.53097691 1.5803399 1.8546716
## 15       ica_pos1_ica 1.1435963 0.20516336 1.0057656 1.2814269
# Bar chart of the group means with 95% CI error bars.
pos <- position_dodge(width = 0.9) # <- (not =) for top-level assignment; dodge keeps error bars aligned with dodged columns
data_summary_plot <- ggplot(data_summary, aes(simple_id, mean, ymin = ci_lower, ymax = ci_upper)) +
  geom_bar(aes(fill = simple_id), stat = "identity", position = pos, width = 0.9) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
  geom_errorbar(aes(group = simple_id), width = 0.2, position = pos) # group, not fill: fill is not a geom_errorbar aesthetic and was ignored with a warning
data_summary_plot

Assumptions Testing

Explore distributions

# Collect each group's OD vectors as list entries for distribution fitting
# (and, later, pairwise comparisons such as Cliff's delta). The identity FUN
# returns each group's full vector; the original used print as the FUN, which
# yielded the same values but echoed every group to the console as a side
# effect.
OD_aggregate <- aggregate(cbind(OD, OD_adjusted, OD_log, OD_scale) ~ simple_id,
                          data = norm_data,
                          function(x) x)
xselect <- combn(OD_aggregate[["simple_id"]], 2) # all pairwise group combinations
names(OD_aggregate$OD_adjusted) <- OD_aggregate$simple_id
# Cullen–Frey skewness/kurtosis graphs (bootstrapped) for each group.
par(mfcol = c(5, 3), mar = c(2, 2, 2, 1))
distribution_explore <- lapply(OD_aggregate$OD_adjusted, function(x) descdist(x, boot = 1000))

par(original_par)
# Maximum-likelihood fits of six candidate distributions to each group.
fit_norm <- lapply(OD_aggregate$OD_adjusted, function(z) fitdist(z, "norm", method = "mle"))
fit_lnorm <- lapply(OD_aggregate$OD_adjusted, function(z) fitdist(z, "lnorm", method = "mle"))
fit_exp <- lapply(OD_aggregate$OD_adjusted, function(z) fitdist(z, "exp", method = "mle"))
fit_gamma <- lapply(OD_aggregate$OD_adjusted, function(z) fitdist(z, "gamma", method = "mle"))
fit_weibull <- lapply(OD_aggregate$OD_adjusted, function(z) fitdist(z, "weibull", method = "mle"))
fit_unif <- lapply(OD_aggregate$OD_adjusted, function(z) fitdist(z, "unif", method = "mle"))
fits <- c(exp = fit_exp, gamma = fit_gamma, norm = fit_norm, lnorm = fit_lnorm, unif = fit_unif, weibull = fit_weibull) #fits using untransformed data are combined
AIC_extract <- list(names(fit_norm), c("exp", "gamma", "norm", "lnorm", "unif", "weibull")) # dimnames (groups x distributions) for the AIC matrix
# NOTE(review): get_AIC() is not defined in this file — presumably a project
# helper that extracts $aic from each fit; confirm it is sourced before here.
AIC_summary <- get_AIC(fits) %>%
  matrix(length(fit_norm), 6, dimnames = AIC_extract) # rows track the number of groups instead of a hard-coded 15
AIC_summary
##                          exp        gamma         norm       lnorm unif
## agrA_              183.22826  158.5988840  161.0763948  160.673220   NA
## agrA_pos1_agrA_20  142.34022   48.4733594   71.6343914   38.878490   NA
## agrA_pos1_agrA_min  38.04552   20.8734071   22.7555900   20.227854   NA
## agrA_pos1_empty    186.90477   87.8028307   97.2997877   84.973206   NA
## atl_                52.40642   38.0290254   61.3701602   35.090585   NA
## atl_pos1_atl       178.36087  103.7874917  111.4571957  102.608745   NA
## atl_pos1_empty      58.95513   29.9584103   46.0820633   30.949991   NA
## ica_pos1_ica        26.95191   -0.4599682   -0.6786362   -0.261482   NA
## icaA_               58.78219   50.3178181   59.6824567   48.848643   NA
## icaA_pos1_empty     51.18617    5.7356090    5.3175281    6.420402   NA
## icaA_pos1_ica       32.16592   11.1203637   12.2598884   10.837000   NA
## srtA_               38.73467   31.6943156   39.0153828   30.839113   NA
## srtA_pos1_empty     77.38615   61.2344188   61.2704718   63.038530   NA
## srtA_pos1_srtA      78.98988   51.5104067   54.3734294   51.341317   NA
## wt_NA              194.00000 -328.0992512 -328.6159285 -327.619426   NA
##                        weibull
## agrA_               157.954380
## agrA_pos1_agrA_20    70.291659
## agrA_pos1_agrA_min   22.619058
## agrA_pos1_empty      98.317435
## atl_                 41.583302
## atl_pos1_atl        108.631569
## atl_pos1_empty       32.501154
## ica_pos1_ica         -1.044914
## icaA_                51.984351
## icaA_pos1_empty       5.129984
## icaA_pos1_ica        12.734397
## srtA_                32.618793
## srtA_pos1_empty      59.804556
## srtA_pos1_srtA       53.220306
## wt_NA              -299.804127
#the coordinates of the subset of the data you want to look at to assess fit.
# The same four goodness-of-fit panels (density, Q-Q, CDF, P-P) were repeated
# verbatim for five groups; the shared sequence is factored into a helper.
# The former loop variables (index1, legend, data_subset) are now local to
# the function instead of being left in the global environment.
#
# index1: position of the group in the fit_* lists (see names(fit_norm)).
# legend: labels for the four fitted distributions, in list order.
compare_fit_panels <- function(index1, legend = c("gamma", "normal", "lognormal", "weibull")) {
  data_subset <- list(fit_gamma[[index1]], fit_norm[[index1]],
                      fit_lnorm[[index1]], fit_weibull[[index1]]) #to compare the normal to lognormal and weibull
  par(mfcol = c(2, 2), mar = c(2, 2, 2, 1))
  denscomp(data_subset, legendtext = legend)
  qqcomp(data_subset, legendtext = legend)
  cdfcomp(data_subset, legendtext = legend)
  ppcomp(data_subset, legendtext = legend)
  par(original_par) # restore the plotting window between groups
}

compare_fit_panels(15) # wt
compare_fit_panels(1)  # agrA_
compare_fit_panels(2)  # agrA_pos1_agrA_20
compare_fit_panels(3)  # agrA_pos1_agrA_min
compare_fit_panels(4)  # agrA_pos1_empty

Test Normality

# Return a two-column data frame of theoretical ("xq") vs sample quantiles
# for x. The second column is named after the expression passed in (e.g.
# "OD_adjusted"), which the downstream ggplot call relies on.
qqnorm_data <- function(x){
  Q <- as.data.frame(qqnorm(x, plot.it = FALSE)) # spell out plot.it: "plot" only worked via partial argument matching
  names(Q) <- c("xq", deparse(substitute(x)))
  Q
}
# Build per-group quantile tables. NOTE: with(., ...) is used so qqnorm_data()
# receives the bare symbol OD_adjusted — substitute(x) inside the function
# turns that symbol into the second column's name, which the ggplot call
# below depends on (aes(y = OD_adjusted)). Do not rewrite as .$OD_adjusted.
theoretical_qq <- norm_data %>%
  group_by(simple_id) %>%
  do(with(., qqnorm_data(OD_adjusted)))

# One Q-Q panel per group: points should track the fitted line if that
# group's adjusted ODs are approximately normal.
ggplot(data = theoretical_qq, aes(x = xq, y = OD_adjusted)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  xlab("Theoretical") +
  ylab("Sample") +
  facet_wrap(~simple_id)

null: the data are normally distributed

norm_data[, shapiro.test(.SD$OD_adjusted)$p.value, by = simple_id]
##              simple_id           V1
##  1:              wt_NA 5.067716e-14
##  2:              agrA_ 1.143929e-01
##  3:               atl_ 1.638651e-05
##  4:              icaA_ 3.867835e-03
##  5:              srtA_ 5.899671e-03
##  6:       atl_pos1_atl 3.206090e-03
##  7:     atl_pos1_empty 6.296682e-04
##  8:    icaA_pos1_empty 7.900115e-01
##  9:    srtA_pos1_empty 1.202589e-01
## 10:      icaA_pos1_ica 3.430248e-01
## 11:     srtA_pos1_srtA 3.053017e-01
## 12:  agrA_pos1_agrA_20 1.154757e-09
## 13: agrA_pos1_agrA_min 4.638858e-02
## 14:    agrA_pos1_empty 4.033927e-04
## 15:       ica_pos1_ica 2.874590e-01

Homoskedasticity

the null hypotheises of these tests are: all group variances are equal

bartlett.test(OD_adjusted ~ simple_id, data=norm_data) # Bartlett Test of Homogeneity of Variances (parametric)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  OD_adjusted by simple_id
## Bartlett's K-squared = 782.06, df = 14, p-value < 2.2e-16
fligner.test(OD_adjusted ~ simple_id, data=norm_data) # Figner-Killeen Test of Homogeneity of Variances (nonparamatric)
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  OD_adjusted by simple_id
## Fligner-Killeen:med chi-squared = 262.56, df = 14, p-value <
## 2.2e-16